Packages used to create the below figures:

library(tidyverse)
library(knitr)
library(kableExtra)
library(plotly)
library(lubridate)
library(readxl)
library(DT)
library(ggcorrplot)

Load partial volume corrected regional tau-PET data, as downloaded from ADNI:

tau.df <- read.csv("../../ADNI_Data/Raw_Data/UCBERKELEYAV1451_PVC_05_12_20.csv", stringsAsFactors = T)
tau.df$EXAMDATE <- as.Date(as.character(tau.df$EXAMDATE), format="%m/%d/%Y")

Filter tau data to contain only subjects with 2+ tau-PET scans, and omit irrelevant columns:

tau.df <- tau.df %>%
  select(-VISCODE, -update_stamp, -HEMIWM_SUVR, -BRAAK12_SUVR,
         -BRAAK34_SUVR, -BRAAK56_SUVR, -OTHER_SUVR) %>%
  select(!matches("VOLUME")) %>%
  group_by(RID) 

colnames(tau.df) <- str_replace_all(colnames(tau.df), "_SUVR", "")

As shown in Data Understanding, the ROIs are not precisely standardized to the inferior cerebellum gray matter SUVR. I will re-standardize the ROI SUVR values here:

tau.stand <- tau.df
for (i in 4:ncol(tau.stand)) {
  tau.stand[i] <- tau.stand[i]/ tau.df[4]
}
rm(tau.df)

The next step is to select brain regions a priori. I am going to stratify the cortical parcellations and subcortical segmentations based on Schöll et al. (2016) and per UCSF’s recommendations for usage of their tau-PET data. Here is the stratification across the Braak stages:

roi.braak <- read.csv("roi_braak_stages.csv") %>% mutate(ROI_Name = tolower(ROI_Name)) %>%
  mutate(Hemisphere = ifelse(str_detect(ROI_Name, "rh_|right"), "Right", "Left"))
datatable(roi.braak)

I will filter the tau-PET dataset to only include SUVR data for ROIs detailed in the above list, by first reshaping the tau-PET SUVR data from wide to long. Then, I will merge left and right hemisphere ROIs into one bilateral ROI by taking the mean SUVR.

tau.stand.roi <- tau.stand %>%
  pivot_longer(., cols=c(-RID, -VISCODE2, -EXAMDATE), names_to="ROI_Name", values_to="SUVR") %>%
  mutate(ROI_Name=tolower(ROI_Name)) %>%
  semi_join(., roi.braak) %>%
  left_join(., roi.braak) %>%
  mutate(ROI_Name = str_replace_all(ROI_Name, "right_|left_|ctx_rh_|ctx_lh_", "")) %>%
  group_by(RID, VISCODE2, EXAMDATE, ROI_Name, Braak) %>%
  summarise(SUVR = mean(SUVR, na.rm=T))

Now, I will re-shape the data back to wide to be compatible with the cognitive status data shape.

tau.stand.roi <- tau.stand.roi %>% 
  select(-Braak) %>%
  pivot_wider(id_cols=c(RID, VISCODE2, EXAMDATE), names_from="ROI_Name",
              values_from="SUVR")

str(tau.stand.roi)
## tibble [1,120 x 28] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ RID                     : int [1:1120] 21 31 31 56 56 56 59 69 69 69 ...
##  $ VISCODE2                : Factor w/ 16 levels "","bl","m108",..: 7 7 8 7 8 9 7 7 8 9 ...
##  $ EXAMDATE                : Date[1:1120], format: "2018-02-02" "2018-04-24" "2019-04-23" "2018-02-20" ...
##  $ bankssts                : num [1:1120] 1.34 1.26 1.23 1.32 1.32 ...
##  $ caudalanteriorcingulate : num [1:1120] 1.29 1.32 1.25 1.35 1.4 ...
##  $ cuneus                  : num [1:1120] 1.81 1.67 1.6 1.62 1.65 ...
##  $ entorhinal              : num [1:1120] 1.54 2.28 2.22 2.07 1.95 ...
##  $ fusiform                : num [1:1120] 1.5 1.38 1.35 1.42 1.4 ...
##  $ hippocampus             : num [1:1120] 1.56 1.52 1.59 1.29 1.36 ...
##  $ inferiortemporal        : num [1:1120] 1.67 1.48 1.45 1.61 1.61 ...
##  $ insula                  : num [1:1120] 1.14 1.22 1.18 1.21 1.18 ...
##  $ isthmuscingulate        : num [1:1120] 1.49 1.38 1.38 1.42 1.51 ...
##  $ lateraloccipital        : num [1:1120] 1.77 1.49 1.46 1.47 1.58 ...
##  $ lingual                 : num [1:1120] 1.66 1.52 1.45 1.42 1.43 ...
##  $ paracentral             : num [1:1120] 1.44 1.35 1.3 1.48 1.48 ...
##  $ parahippocampal         : num [1:1120] 1.18 1.43 1.45 1.39 1.37 ...
##  $ pericalcarine           : num [1:1120] 1.61 1.33 1.29 1.23 1.27 ...
##  $ postcentral             : num [1:1120] 1.49 1.4 1.34 1.43 1.42 ...
##  $ posteriorcingulate      : num [1:1120] 1.37 1.39 1.33 1.33 1.38 ...
##  $ precentral              : num [1:1120] 1.37 1.4 1.33 1.3 1.34 ...
##  $ precuneus               : num [1:1120] 1.43 1.4 1.39 1.43 1.48 ...
##  $ rostralanteriorcingulate: num [1:1120] 1.32 1.21 1.15 1.21 1.2 ...
##  $ superiorfrontal         : num [1:1120] 1.41 1.42 1.34 1.43 1.43 ...
##  $ superiorparietal        : num [1:1120] 1.39 1.5 1.46 1.49 1.52 ...
##  $ superiortemporal        : num [1:1120] 1.53 1.45 1.38 1.46 1.44 ...
##  $ supramarginal           : num [1:1120] 1.41 1.37 1.32 1.41 1.45 ...
##  $ temporalpole            : num [1:1120] 1.71 1.64 1.54 1.54 1.43 ...
##  $ transversetemporal      : num [1:1120] 1.41 1.46 1.37 1.28 1.21 ...
##  - attr(*, "groups")= tibble [1,120 x 4] (S3: tbl_df/tbl/data.frame)
##   ..$ RID     : int [1:1120] 21 31 31 56 56 56 59 69 69 69 ...
##   ..$ VISCODE2: Factor w/ 16 levels "","bl","m108",..: 7 7 8 7 8 9 7 7 8 9 ...
##   ..$ EXAMDATE: Date[1:1120], format: "2018-02-02" "2018-04-24" "2019-04-23" ...
##   ..$ .rows   : list<int> [1:1120] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   .. ..$ : int 46
##   .. ..$ : int 47
##   .. ..$ : int 48
##   .. ..$ : int 49
##   .. ..$ : int 50
##   .. ..$ : int 51
##   .. ..$ : int 52
##   .. ..$ : int 53
##   .. ..$ : int 54
##   .. ..$ : int 55
##   .. ..$ : int 56
##   .. ..$ : int 57
##   .. ..$ : int 58
##   .. ..$ : int 59
##   .. ..$ : int 60
##   .. ..$ : int 61
##   .. ..$ : int 62
##   .. ..$ : int 63
##   .. ..$ : int 64
##   .. ..$ : int 65
##   .. ..$ : int 66
##   .. ..$ : int 67
##   .. ..$ : int 68
##   .. ..$ : int 69
##   .. ..$ : int 70
##   .. ..$ : int 71
##   .. ..$ : int 72
##   .. ..$ : int 73
##   .. ..$ : int 74
##   .. ..$ : int 75
##   .. ..$ : int 76
##   .. ..$ : int 77
##   .. ..$ : int 78
##   .. ..$ : int 79
##   .. ..$ : int 80
##   .. ..$ : int 81
##   .. ..$ : int 82
##   .. ..$ : int 83
##   .. ..$ : int 84
##   .. ..$ : int 85
##   .. ..$ : int 86
##   .. ..$ : int 87
##   .. ..$ : int 88
##   .. ..$ : int 89
##   .. ..$ : int 90
##   .. ..$ : int 91
##   .. ..$ : int 92
##   .. ..$ : int 93
##   .. ..$ : int 94
##   .. ..$ : int 95
##   .. ..$ : int 96
##   .. ..$ : int 97
##   .. ..$ : int 98
##   .. ..$ : int 99
##   .. .. [list output truncated]
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

ADNI compiled a merged dataset containing key information from several tables, including subject demographics, selected cognitive assessment scores, and select biomarker data.

subj.info <- read.csv("../../ADNI_Data/Raw_Data/ADNIMERGE.csv", stringsAsFactors = T, na.strings="")

Filter subject demographics data to contain only the following features of interest:

subj.info <- subj.info %>% select(RID, VISCODE, AGE, PTGENDER, CDRSB, DX)

I actually can’t join the two datasets on the EXAMDATE feature, as these sometimes differ by one or two days depending on when the records were entered. Instead, I will join by the RID subject identifier and VISCODE, a visit code identifier.

full.df <- inner_join(tau.stand.roi, subj.info, by=c("RID", "VISCODE2"="VISCODE"))  %>%
  filter(!is.na(CDRSB)) %>%
  group_by(RID) %>%
  mutate(n_visits = n()) %>%
  filter(n_visits>1) %>%
  select(-n_visits)

Click to see the structure of this merged dataset:

str(full.df)
## tibble [576 x 32] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ RID                     : int [1:576] 31 31 56 56 56 69 69 69 96 96 ...
##  $ VISCODE2                : Factor w/ 30 levels "","bl","m108",..: 7 8 7 8 9 7 8 9 5 6 ...
##  $ EXAMDATE                : Date[1:576], format: "2018-04-24" "2019-04-23" "2018-02-20" "2019-01-10" ...
##  $ bankssts                : num [1:576] 1.26 1.23 1.32 1.32 1.3 ...
##  $ caudalanteriorcingulate : num [1:576] 1.32 1.25 1.35 1.4 1.21 ...
##  $ cuneus                  : num [1:576] 1.67 1.6 1.62 1.65 1.56 ...
##  $ entorhinal              : num [1:576] 2.28 2.22 2.07 1.95 1.98 ...
##  $ fusiform                : num [1:576] 1.38 1.35 1.42 1.4 1.34 ...
##  $ hippocampus             : num [1:576] 1.52 1.59 1.29 1.36 1.34 ...
##  $ inferiortemporal        : num [1:576] 1.48 1.45 1.61 1.61 1.58 ...
##  $ insula                  : num [1:576] 1.22 1.18 1.21 1.18 1.08 ...
##  $ isthmuscingulate        : num [1:576] 1.38 1.38 1.42 1.51 1.36 ...
##  $ lateraloccipital        : num [1:576] 1.49 1.46 1.47 1.58 1.59 ...
##  $ lingual                 : num [1:576] 1.52 1.45 1.42 1.43 1.38 ...
##  $ paracentral             : num [1:576] 1.35 1.3 1.48 1.48 1.41 ...
##  $ parahippocampal         : num [1:576] 1.43 1.45 1.39 1.37 1.29 ...
##  $ pericalcarine           : num [1:576] 1.33 1.29 1.23 1.27 1.22 ...
##  $ postcentral             : num [1:576] 1.4 1.34 1.43 1.42 1.34 ...
##  $ posteriorcingulate      : num [1:576] 1.39 1.33 1.33 1.38 1.25 ...
##  $ precentral              : num [1:576] 1.4 1.33 1.3 1.34 1.25 ...
##  $ precuneus               : num [1:576] 1.4 1.39 1.43 1.48 1.37 ...
##  $ rostralanteriorcingulate: num [1:576] 1.21 1.15 1.21 1.2 1.06 ...
##  $ superiorfrontal         : num [1:576] 1.42 1.34 1.43 1.43 1.35 ...
##  $ superiorparietal        : num [1:576] 1.5 1.46 1.49 1.52 1.43 ...
##  $ superiortemporal        : num [1:576] 1.45 1.38 1.46 1.44 1.37 ...
##  $ supramarginal           : num [1:576] 1.37 1.32 1.41 1.45 1.37 ...
##  $ temporalpole            : num [1:576] 1.64 1.54 1.54 1.43 1.47 ...
##  $ transversetemporal      : num [1:576] 1.46 1.37 1.28 1.21 1.06 ...
##  $ AGE                     : num [1:576] 77.7 77.7 69.6 69.6 69.6 72.9 72.9 72.9 79.6 79.6 ...
##  $ PTGENDER                : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 2 2 2 ...
##  $ CDRSB                   : num [1:576] 0 0 0.5 0.5 0 0.5 0 0.5 0 0 ...
##  $ DX                      : Factor w/ 3 levels "CN","Dementia",..: 1 1 3 3 3 3 3 3 1 1 ...
##  - attr(*, "groups")= tibble [243 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ RID  : int [1:243] 31 56 69 96 112 377 416 467 618 668 ...
##   ..$ .rows: list<int> [1:243] 
##   .. ..$ : int [1:2] 1 2
##   .. ..$ : int [1:3] 3 4 5
##   .. ..$ : int [1:3] 6 7 8
##   .. ..$ : int [1:3] 9 10 11
##   .. ..$ : int [1:3] 12 13 14
##   .. ..$ : int [1:3] 15 16 17
##   .. ..$ : int [1:2] 18 19
##   .. ..$ : int [1:2] 20 21
##   .. ..$ : int [1:2] 22 23
##   .. ..$ : int [1:2] 24 25
##   .. ..$ : int [1:2] 26 27
##   .. ..$ : int [1:2] 28 29
##   .. ..$ : int [1:3] 30 31 32
##   .. ..$ : int [1:2] 33 34
##   .. ..$ : int [1:3] 35 36 37
##   .. ..$ : int [1:2] 38 39
##   .. ..$ : int [1:4] 40 41 42 43
##   .. ..$ : int [1:2] 44 45
##   .. ..$ : int [1:3] 46 47 48
##   .. ..$ : int [1:3] 49 50 51
##   .. ..$ : int [1:3] 52 53 54
##   .. ..$ : int [1:2] 55 56
##   .. ..$ : int [1:2] 57 58
##   .. ..$ : int [1:4] 59 60 61 62
##   .. ..$ : int [1:3] 63 64 65
##   .. ..$ : int [1:2] 66 67
##   .. ..$ : int [1:4] 68 69 70 71
##   .. ..$ : int [1:2] 72 73
##   .. ..$ : int [1:4] 74 75 76 77
##   .. ..$ : int [1:3] 78 79 80
##   .. ..$ : int [1:2] 81 82
##   .. ..$ : int [1:2] 83 84
##   .. ..$ : int [1:3] 85 86 87
##   .. ..$ : int [1:2] 88 89
##   .. ..$ : int [1:2] 90 91
##   .. ..$ : int [1:3] 92 93 94
##   .. ..$ : int [1:2] 95 96
##   .. ..$ : int [1:4] 97 98 99 100
##   .. ..$ : int [1:2] 101 102
##   .. ..$ : int [1:2] 103 104
##   .. ..$ : int [1:2] 105 106
##   .. ..$ : int [1:2] 107 108
##   .. ..$ : int [1:3] 109 110 111
##   .. ..$ : int [1:3] 112 113 114
##   .. ..$ : int [1:4] 115 116 117 118
##   .. ..$ : int [1:2] 119 120
##   .. ..$ : int [1:2] 121 122
##   .. ..$ : int [1:2] 123 124
##   .. ..$ : int [1:2] 125 126
##   .. ..$ : int [1:3] 127 128 129
##   .. ..$ : int [1:3] 130 131 132
##   .. ..$ : int [1:2] 133 134
##   .. ..$ : int [1:2] 135 136
##   .. ..$ : int [1:2] 137 138
##   .. ..$ : int [1:2] 139 140
##   .. ..$ : int [1:2] 141 142
##   .. ..$ : int [1:3] 143 144 145
##   .. ..$ : int [1:2] 146 147
##   .. ..$ : int [1:2] 148 149
##   .. ..$ : int [1:3] 150 151 152
##   .. ..$ : int [1:4] 153 154 155 156
##   .. ..$ : int [1:2] 157 158
##   .. ..$ : int [1:2] 159 160
##   .. ..$ : int [1:2] 161 162
##   .. ..$ : int [1:3] 163 164 165
##   .. ..$ : int [1:2] 166 167
##   .. ..$ : int [1:3] 168 169 170
##   .. ..$ : int [1:2] 171 172
##   .. ..$ : int [1:2] 173 174
##   .. ..$ : int [1:3] 175 176 177
##   .. ..$ : int [1:2] 178 179
##   .. ..$ : int [1:2] 180 181
##   .. ..$ : int [1:3] 182 183 184
##   .. ..$ : int [1:2] 185 186
##   .. ..$ : int [1:3] 187 188 189
##   .. ..$ : int [1:3] 190 191 192
##   .. ..$ : int [1:2] 193 194
##   .. ..$ : int [1:3] 195 196 197
##   .. ..$ : int [1:4] 198 199 200 201
##   .. ..$ : int [1:2] 202 203
##   .. ..$ : int [1:2] 204 205
##   .. ..$ : int [1:2] 206 207
##   .. ..$ : int [1:2] 208 209
##   .. ..$ : int [1:2] 210 211
##   .. ..$ : int [1:2] 212 213
##   .. ..$ : int [1:2] 214 215
##   .. ..$ : int [1:2] 216 217
##   .. ..$ : int [1:2] 218 219
##   .. ..$ : int [1:4] 220 221 222 223
##   .. ..$ : int [1:2] 224 225
##   .. ..$ : int [1:4] 226 227 228 229
##   .. ..$ : int [1:2] 230 231
##   .. ..$ : int [1:2] 232 233
##   .. ..$ : int [1:3] 234 235 236
##   .. ..$ : int [1:3] 237 238 239
##   .. ..$ : int [1:2] 240 241
##   .. ..$ : int [1:2] 242 243
##   .. ..$ : int [1:4] 244 245 246 247
##   .. ..$ : int [1:3] 248 249 250
##   .. .. [list output truncated]
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
cat("\nNumber of longitudinal tau-PET scans with accompanying cognitive data: **\n",
    nrow(full.df), "**\nNumber of subjects in merged dataset: **", 
    length(unique(full.df$RID)), "**\n", "\n", sep="")

Number of longitudinal tau-PET scans with accompanying cognitive data: 576 Number of subjects in merged dataset: 243 As it turns out, only 588 of the original 593 tau-PET scans had corresponding cognitive assessments. This leaves 576 unique PET scan datapoints for 243 subjects.

Lastly, before I can perform outlier detection, I need to derive the longitudinal features upon which the prediction models will be built – namely, annual change in tau-PET SUVR and annual change in CDR-Sum of Boxes score.

annual.changes <- full.df %>%
  ungroup() %>%
  select(-AGE, -PTGENDER, -DX, -VISCODE2) %>%
  pivot_longer(cols=c(-RID, -EXAMDATE), names_to="Metric",
               values_to="Value") %>%
  group_by(RID, Metric) %>%
  summarise(n_years = as.numeric((EXAMDATE - lag(EXAMDATE, 
                                                 default=EXAMDATE[1]))/365),
            change = Value - lag(Value, default=Value[1])) %>%
  filter(n_years > 0) %>%
  mutate(Annual_Change = change/n_years) %>%
  select(-n_years, -change) %>%
  group_by(RID, Metric) %>%
  mutate(interval_num = row_number()) %>%
  pivot_wider(., id_cols=c(RID, interval_num), names_from=Metric,
              values_from=Annual_Change)
datatable(annual.changes[1:5])

Now that the datasets are merged, I can perform outlier detection. Given the multivariate nature of this dataset (i.e. multiple brain regions), I will use Cook’s Distance to estimate the relative influence of each data point in a simple multiple regression model.

cooks.distance <- cooks.distance(lm(CDRSB ~ . - RID - interval_num, data=annual.changes))
plot(cooks.distance, pch=1, cex=1)
abline(h = 4*mean(cooks.distance, na.rm = T), col = "blue")
text(x = 1:length(cooks.distance)+2, 
     y=cooks.distance, 
     labels=ifelse(cooks.distance > 4 * mean(cooks.distance, na.rm=T), 
                   names(cooks.distance),""), 
     col="blue")

All but one data point have relatively low Cook’s distance values, while data point #224 has a relatively large Cook’s distance. This suggests large residuals and leverage associated with this datapoint, which could distort model fitting and accuracy. Upon further examination of this instance:

as.data.frame(t(annual.changes[224,])) %>%
  rownames_to_column(var="Variable") %>%
  rename("Value" = "V1") %>%
  kable(., booktabs=T) %>%
  kable_styling(full_width = F)
Variable Value
RID 6039.0000000
interval_num 2.0000000
bankssts -0.2439197
caudalanteriorcingulate -1.3044195
CDRSB -0.6564748
cuneus -3.8446557
entorhinal -1.0297746
fusiform -4.3524547
hippocampus -0.6379813
inferiortemporal -3.3420989
insula -0.1532815
isthmuscingulate -4.2801964
lateraloccipital -7.1132861
lingual -4.6834738
paracentral 1.8873874
parahippocampal -0.8278863
pericalcarine -1.2536324
postcentral 1.8356621
posteriorcingulate -3.2157580
precentral -1.0355903
precuneus -1.8532021
rostralanteriorcingulate 0.1560680
superiorfrontal -0.6848381
superiorparietal -1.6307715
superiortemporal 1.3453180
supramarginal -1.0635341
temporalpole 0.1966368
transversetemporal 2.0079720

This subject exhibits very large fluctuations in tau-PET SUVR values in several brain regions for this associated time interval. Given that SUVR values typically range from 0.75-2, changes of this large magnitude is surprising, and may certainly render this data point an outlier. Fortunately, the interval_num of 2 indicates that this is the second time interval for this subject, so omitting this interval doesn’t reduce the total number of subjects in the analysis. I will remove this data point:

annual.changes <- annual.changes[-224,]

I can now finish some aspects of data exploration that depended upon refining the subject cohort as well as the features. For starters, I will visualize the correlation in annual tau change between each of the ROIs measured:

annual.roi <- annual.changes %>% ungroup() %>% select(-RID, -interval_num, -CDRSB)
roi.cor <- cor(annual.roi)
p.mat <- cor_pmat(annual.roi)

ggcorrplot(roi.cor, hc.order = TRUE, 
           outline.col = "white", p.mat = p.mat,
           insig = "blank") %>% ggplotly() %>%
  layout(autosize = F, width = 800, height = 800)